##Replication Script for "Who Are These People"
rm(list=ls())

#-------------------------------------------------------------------
#Setting Working Directory
setwd("/Users/connorhuff/Dropbox/")
setwd("C:/Users/dtingley/Dropbox")
setwd("TingleyConnorHuff/MTurkVsCCES/RAP Final/Replication File")
getwd()


#-------------------------------------------------------------------
ipak <- function(pkg){
  new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
  if(length(new.pkg)) install.packages(new.pkg, dependencies = TRUE)
  sapply(pkg, require, character.only = TRUE)
}

packages <- c("foreign","stringr", "xtable", "ggplot2","gdata", "gam", "descr", "survey", "sp", "maps", "maptools", "zipcode", "gridExtra",
              "plyr", "ggplot2", "scales", "RColorBrewer", "rgdal", "car")
ipak(packages)

#-------------------------------------------------------------------
#Loading the data: the data contains the CCES data stacked on top of the mTurk data denoted by the variable "Dataset"
data <- read.csv("Replication_Data.csv")
cces.data <- data[1:1300,] ##cces.data is the first 1300 observations
mturk.data <- data[1301:4006,] ##mturk.data is the next 2706



#=============================================================================================================================================
#============================================================Figure 1=========================================================================
#=============================================================================================================================================
#Using the CCES data to generate figure one (CCES weights)


#Creating an indicator variable for being born before or after the mean in all the data
cces.data$younger <- as.numeric(cces.data$birthyear >= mean(data$birthyear)) 
cces.data.younger <- cces.data[cces.data$younger==1,]
cces.data.older <- cces.data[cces.data$younger==0,]

#Plotting
pdf("Figure1.pdf")
par(mfrow=c(1,1))
plot(density(cces.data.older$survey.weights, na.rm=T), col="black", main="", xlab="Weight")
lines(density(cces.data.younger$survey.weights, na.rm=T), col="black", lty=2)
legend("topright", c("Born in 1974 or Later", "Born Before 1974"), lty=c(2,1), cex=0.8, col=c("black", "black"))
dev.off()






#=============================================================================================================================================
#============================================================Figure 2=========================================================================
#=============================================================================================================================================
#Generating the Mosaic Plots for figure 2 

#-------------------------------------------------------------------
#creating an indicator variable for being born before or after the mean of all data for cces and mturk respectively
cces.data$younger <- as.numeric(cces.data$birthyear>=mean(data$birthyear)) 
mturk.data$younger <- as.numeric(mturk.data$birthyear>=mean(data$birthyear)) 

#-------------------------------------------------------------------
#Subsetting the data but race

#CCES
white.cces.data <- cces.data[cces.data$white==1,]
black.cces.data <- cces.data[cces.data$black==1,]
asian.cces.data <- cces.data[cces.data$asian==1,]
#converting NAs to zero
cces.data$final.hispanic[is.na(cces.data$final.hispanic)] <- 0
hispanic.cces.data <- cces.data[cces.data$final.hispanic==1,]

#Mturk
white.mturk.data <- mturk.data[mturk.data$white==1,]
black.mturk.data <- mturk.data[mturk.data$black==1,]
asian.mturk.data <- mturk.data[mturk.data$asian==1,]
#converting NAs to zero
mturk.data$final.hispanic[is.na(mturk.data$final.hispanic)] <- 0
hispanic.mturk.data <- mturk.data[mturk.data$final.hispanic==1,]


#-------------------------------------------------------------------
#Making new factor variables that will be run through crosstab

#All races
mturk.data$gender.f <- recode(mturk.data$gender, "c('1')='Male'; c('0')='Female'", as.factor.result=TRUE)
mturk.data$younger.f <- recode(mturk.data$younger, "c('1')='Younger'; c('0')='Older'", as.factor.result=TRUE)
cces.data$gender.f <- recode(cces.data$gender, "c('1')='Male'; c('0')='Female'", as.factor.result=TRUE)
cces.data$younger.f <- recode(cces.data$younger, "c('1')='Younger'; c('0')='Older'", as.factor.result=TRUE)


#White
white.mturk.data$gender.f <- recode(white.mturk.data$gender, "c('1')='Male'; c('0')='Female'", as.factor.result=TRUE)
white.mturk.data$younger.f <- recode(white.mturk.data$younger, "c('1')='Younger'; c('0')='Older'", as.factor.result=TRUE)
white.cces.data$gender.f <- recode(white.cces.data$gender, "c('1')='Male'; c('0')='Female'", as.factor.result=TRUE)
white.cces.data$younger.f <- recode(white.cces.data$younger, "c('1')='Younger'; c('0')='Older'", as.factor.result=TRUE)

#Black
black.mturk.data$gender.f <- recode(black.mturk.data$gender, "c('1')='Male'; c('0')='Female'", as.factor.result=TRUE)
black.mturk.data$younger.f <- recode(black.mturk.data$younger, "c('1')='Younger'; c('0')='Older'", as.factor.result=TRUE)
black.cces.data$gender.f <- recode(black.cces.data$gender, "c('1')='Male'; c('0')='Female'", as.factor.result=TRUE)
black.cces.data$younger.f <- recode(black.cces.data$younger, "c('1')='Younger'; c('0')='Older'", as.factor.result=TRUE)

#Hispanic
hispanic.mturk.data$gender.f <- recode(hispanic.mturk.data$gender, "c('1')='Male'; c('0')='Female'", as.factor.result=TRUE)
hispanic.mturk.data$younger.f <- recode(hispanic.mturk.data$younger, "c('1')='Younger'; c('0')='Older'", as.factor.result=TRUE)
hispanic.cces.data$gender.f <- recode(hispanic.cces.data$gender, "c('1')='Male'; c('0')='Female'", as.factor.result=TRUE)
hispanic.cces.data$younger.f <- recode(hispanic.cces.data$younger, "c('1')='Younger'; c('0')='Older'", as.factor.result=TRUE)


#Asian
asian.mturk.data$gender.f <- recode(asian.mturk.data$gender, "c('1')='Male'; c('0')='Female'", as.factor.result=TRUE)
asian.mturk.data$younger.f <- recode(asian.mturk.data$younger, "c('1')='Younger'; c('0')='Older'", as.factor.result=TRUE)
asian.cces.data$gender.f <- recode(asian.cces.data$gender, "c('1')='Male'; c('0')='Female'", as.factor.result=TRUE)
asian.cces.data$younger.f <- recode(asian.cces.data$younger, "c('1')='Younger'; c('0')='Older'", as.factor.result=TRUE)




#-------------------------------------------------------------------
#Making the tables

#All races
mturk.all.table <- table(mturk.data$gender.f, mturk.data$younger.f)
cces.all.table <- table(cces.data$gender.f, cces.data$younger.f)

#White
mturk.white.table <- table(white.mturk.data$gender.f, white.mturk.data$younger.f)
cces.white.table <- table(white.cces.data$gender.f, white.cces.data$younger.f)

#Black
mturk.black.table <- table(black.mturk.data$gender.f, black.mturk.data$younger.f)
cces.black.table <- table(black.cces.data$gender.f, black.cces.data$younger.f)

#Hispanic
mturk.hispanic.table <- table(hispanic.mturk.data$gender.f, hispanic.mturk.data$younger.f)
cces.hispanic.table <- table(hispanic.cces.data$gender.f, hispanic.cces.data$younger.f)

#Asian
mturk.asian.table <- table(asian.mturk.data$gender.f, asian.mturk.data$younger.f)
cces.asian.table <- table(asian.cces.data$gender.f, asian.cces.data$younger.f)



#-------------------------------------------------------------------
#Using Mosaic Plot in graphics package

pdf("Figure2.pdf")
par(mfrow=c(5,2), mar=c(2,2,1,1), oma=c(1,1,0,1), cex.lab = 1.5, cex.main = 1)
mosaicplot(mturk.all.table, main = paste0("All Races (", nrow(mturk.data), " Respondents)"),
           color=TRUE, cex.axis = 1)
mosaicplot(cces.all.table, main = paste0("All Races (", nrow(cces.data), " Respondents)"),
           color=TRUE, cex.axis = 1)
mosaicplot(mturk.white.table, main = paste0("White (", nrow(white.mturk.data), " Respondents)"),
           color=TRUE, cex.axis = 1)
mosaicplot(cces.white.table, main = paste0("White (", nrow(white.cces.data), " Respondents)"),
           color=TRUE, cex.axis = 1)
mosaicplot(mturk.black.table, main = paste0("Black (", nrow(black.mturk.data), " Respondents)"),
           color=TRUE, cex.axis = 1)
mosaicplot(cces.black.table, main = paste0("Black (", nrow(black.cces.data), " Respondents)"),
           color=TRUE, cex.axis = 1)
mosaicplot(mturk.hispanic.table, main = paste0("Hispanic (", nrow(hispanic.mturk.data), " Respondents)"),
           color=TRUE, cex.axis = 1)
mosaicplot(cces.hispanic.table, main = paste0("Hispanic (", nrow(hispanic.cces.data), " Respondents)"),
           color=TRUE, cex.axis = 1)
mosaicplot(mturk.asian.table, main = paste0("Asian (", nrow(asian.mturk.data), " Respondents)"),
           xlab= "MTurk", color=TRUE, cex.axis = 1)
mosaicplot(cces.asian.table, main = paste0("Asian (", nrow(asian.cces.data), " Respondents)"),
           xlab= "CCES", color=TRUE, cex.axis = 1)
dev.off()


#=============================================================================================================================================
#============================================================Figure 3=========================================================================
#=============================================================================================================================================
#Using regression to calculate the difference between MTurk and CCES 


#Changing the "Not Sures" to NAs
#PID 7
vec <- data$pid7
vec[vec==8] <- NA
data$pid7 <- vec

#ideology
vec <- data$ideorate
vec[vec==8] <- NA
data$ideorate <- vec

#newsint
vec <- data$newsint
vec[vec==5] <- NA
data$newsint <- vec

#votereg
vec <- data$votereg
vec[vec==3] <- NA
data$votereg <- vec

#vote2012
vec <- data$vote2012
vec[vec==6] <- NA
data$vote2012 <- vec



#-------------------------------------------------------------------
#creating a vector of survey weights where mturk weights are set to be 1 (this will be used in Figure 4 but want to add this variable before splitting data)
survey.weights.all <- c(cces.data$survey.weights, rep(1, 2706))
data$survey.weights.all <- survey.weights.all

#Subsetting the data to younger and older based on the mean ages
data$younger <- as.numeric(data$birthyear >= mean(data$birthyear)) 
older.data <- data[data$younger==0,]
younger.data <- data[data$younger==1,]


#-------------------------------------------------------------------
#For loop that gives the differences in means with standard errors for variables in the young CCES and mTurk datasets
young.coef.bin <- NULL #This will store the coefficient estimates
young.se.bin <- NULL #This will store the standard errors


#for loop storing regression coefficients and standard errors
for(i in 1:14){
  lm <- (lm(younger.data[,i] ~ dataset, data=younger.data))
  young.coef.bin[i] <- summary(lm)$coef[2,1]
  young.se.bin[i] <- summary(lm)$coef[2,2]
}


#Creating a matrix with the variables, corresponding coefficient estimates, and standard errors
reg.young.data <- cbind(colnames(younger.data)[1:14], young.coef.bin, young.se.bin)



#-------------------------------------------------------------------
#Older CCES
old.coef.bin <- NULL #This will store the coefficient estimates
old.se.bin <- NULL #This will store the standard errors

#for loop storing regression coefficients and standard errors
for(i in 1:14){
  lm <- (lm(older.data[,i] ~ dataset, data=older.data))
  old.coef.bin[i] <- summary(lm)$coef[2,1]
  old.se.bin[i] <- summary(lm)$coef[2,2]
}

#Creating a matrix with the variables, corresponding coefficient estimates, and standard errors
reg.old.data <- cbind(colnames(older.data)[1:14], old.coef.bin, old.se.bin)



#-------------------------------------------------------------------
#creating a Dataframe with the variables for Figure 3. This will be run through ggplot
var.names <- c("PID 7", "Education", "News Interest", "Vote Registration", "Vote 2012", "Ideo 7")
allvarsplot.young.binary <- data.frame(Variable = var.names,
                                       Difference = c(as.numeric((reg.young.data)[12,2]), as.numeric((reg.young.data)[7,2]), 
                                                      as.numeric((reg.young.data)[13,2]),as.numeric((reg.young.data)[8,2]),
                                                      as.numeric((reg.young.data)[11,2]),as.numeric((reg.young.data)[10,2])),
                                       SE = c(as.numeric((reg.young.data)[12,3]), as.numeric((reg.young.data)[7,3]),
                                              as.numeric((reg.young.data)[13,3]),as.numeric((reg.young.data)[8,3]),
                                              as.numeric((reg.young.data)[11,3]),as.numeric((reg.young.data)[10,3])),
                                       modelName = "Young")


allvarsplot.old.binary <- data.frame(Variable = var.names,
                                     Difference = c(as.numeric((reg.old.data)[12,2]), as.numeric((reg.old.data)[7,2]), 
                                                    as.numeric((reg.old.data)[13,2]),as.numeric((reg.old.data)[8,2]),
                                                    as.numeric((reg.old.data)[11,2]),as.numeric((reg.old.data)[10,2])),
                                     SE = c(as.numeric((reg.old.data)[12,3]), as.numeric((reg.old.data)[7,3]),
                                            as.numeric((reg.old.data)[13,3]),as.numeric((reg.old.data)[8,3]),
                                            as.numeric((reg.old.data)[11,3]),as.numeric((reg.old.data)[10,3])),
                                     modelName = "Old")

# Combine the data frames
allvarsplot.allModelFrame <- data.frame(rbind(allvarsplot.young.binary, allvarsplot.old.binary)) # etc.

#naming column 4 of the dataframe "Age"
names(allvarsplot.allModelFrame)[4] <- "Age"

# Specify the width of the confidence intervals (95%)
interval <- -qnorm((1-0.95)/2) # 95% multiplier

# Plotting using ggplot
allvarsplot <- ggplot(allvarsplot.allModelFrame, aes(linetype = Age)) 
allvarsplot <- allvarsplot + geom_hline(yintercept = 0, colour = gray(1/2), lty = 2) #this adds the vertical line
allvarsplot <- allvarsplot + geom_linerange(aes(x = Variable, ymin = Difference - SE*interval,
                                                ymax = Difference + SE*interval),
                                            lwd = 1, position = position_dodge(width = 1/2))
allvarsplot <- allvarsplot + geom_pointrange(aes(x = Variable, y = Difference, ymin = Difference - SE*interval,
                                                 ymax = Difference + SE*interval),
                                             lwd = 1/2, position = position_dodge(width = 1/2),
                                             shape = 21, fill = "WHITE")
allvarsplot <- allvarsplot + coord_flip() + theme_bw()


allvarsplot <- ggplot(allvarsplot.allModelFrame, aes(linetype = Age)) +
  geom_hline(yintercept = 0, colour = gray(1/2), lty = 2) + #this adds the vertical line
  geom_linerange(aes(x = Variable, ymin = Difference - SE*interval, ymax = Difference + SE*interval),
                 lwd = 1, position = position_dodge(width = 1/2)) +
  geom_pointrange(aes(x = Variable, y = Difference, ymin = Difference - SE*interval, ymax = Difference + SE*interval),
                  lwd = 1/2, position = position_dodge(width = 1/2),
                  shape = 21, fill = "WHITE") +
  coord_flip() + 
  theme_bw() +
  ggtitle("")

pdf("Figure3.pdf")
print(allvarsplot)
dev.off()



#=============================================================================================================================================
#============================================================Figure 4=========================================================================
#=============================================================================================================================================
#===========================================================================================================================================
#Same as Figure 3 but using the weights from CCES

#-------------------------------------------------------------------
#For loops that calculate the differences in means with standard errors for variables in the CCES and mTurk datasets.
young.coef.bin <- NULL #This will store the coefficient estimates
young.se.bin <- NULL #This will store the standard errors

#The for loop
for(i in 1:14){
  lm <- (lm(younger.data[,i] ~ dataset, weights=survey.weights.all, data=younger.data))
  young.coef.bin[i] <- summary(lm)$coef[2,1]
  young.se.bin[i] <- summary(lm)$coef[2,2]
}

#Creating a matrix with the variables, corresponding coefficient estimates, and standard errors
reg.young.data <- cbind(colnames(younger.data)[1:14], young.coef.bin, young.se.bin)


#-------------------------------------------------------------------
#Older 
old.coef.bin <- NULL #This will store the coefficient estimates
old.se.bin <- NULL #This will store the standard errors

#for loop storing regression summary
for(i in 1:14){
  lm <- (lm(older.data[,i] ~ dataset, weights= survey.weights.all, data=older.data))
  old.coef.bin[i] <- summary(lm)$coef[2,1]
  old.se.bin[i] <- summary(lm)$coef[2,2]
}

#Creating a matrix with the variables, corresponding coefficient estimates, and standard errors
reg.old.data <- cbind(colnames(older.data)[1:14], old.coef.bin, old.se.bin)



#-------------------------------------------------------------------
var.names <- c("PID 7", "Education", "News Interest", "Vote Registration", "Vote 2012", "Ideo 7")
allvarsplot.young.binary <- data.frame(Variable = var.names,
                                       Difference = c(as.numeric((reg.young.data)[12,2]), as.numeric((reg.young.data)[7,2]), 
                                                      as.numeric((reg.young.data)[13,2]),as.numeric((reg.young.data)[8,2]),
                                                      as.numeric((reg.young.data)[11,2]),as.numeric((reg.young.data)[10,2])),
                                       SE = c(as.numeric((reg.young.data)[12,3]), as.numeric((reg.young.data)[7,3]),
                                              as.numeric((reg.young.data)[13,3]),as.numeric((reg.young.data)[8,3]),
                                              as.numeric((reg.young.data)[11,3]),as.numeric((reg.young.data)[10,3])),
                                       modelName = "Young")


allvarsplot.old.binary <- data.frame(Variable = var.names,
                                     Difference = c(as.numeric((reg.old.data)[12,2]), as.numeric((reg.old.data)[7,2]), 
                                                    as.numeric((reg.old.data)[13,2]),as.numeric((reg.old.data)[8,2]),
                                                    as.numeric((reg.old.data)[11,2]),as.numeric((reg.old.data)[10,2])),
                                     SE = c(as.numeric((reg.old.data)[12,3]), as.numeric((reg.old.data)[7,3]),
                                            as.numeric((reg.old.data)[13,3]),as.numeric((reg.old.data)[8,3]),
                                            as.numeric((reg.old.data)[11,3]),as.numeric((reg.old.data)[10,3])),
                                     modelName = "Old")

# Combine the data frames
allvarsplot.allModelFrame <- data.frame(rbind(allvarsplot.young.binary, allvarsplot.old.binary)) # etc.

names(allvarsplot.allModelFrame)[4] <- "Age"

# Specify the width of the confidence intervals (95%)
interval <- -qnorm((1-0.95)/2) # 95% multiplier

# Plotting using ggplot
allvarsplot <- ggplot(allvarsplot.allModelFrame, aes(linetype = Age)) 
allvarsplot <- allvarsplot + geom_hline(yintercept = 0, colour = gray(1/2), lty = 2) #this adds the vertical line
allvarsplot <- allvarsplot + geom_linerange(aes(x = Variable, ymin = Difference - SE*interval,
                                                ymax = Difference + SE*interval),
                                            lwd = 1, position = position_dodge(width = 1/2))
allvarsplot <- allvarsplot + geom_pointrange(aes(x = Variable, y = Difference, ymin = Difference - SE*interval,
                                                 ymax = Difference + SE*interval),
                                             lwd = 1/2, position = position_dodge(width = 1/2),
                                             shape = 21, fill = "WHITE")
allvarsplot <- allvarsplot + coord_flip() + theme_bw()


allvarsplot <- ggplot(allvarsplot.allModelFrame, aes(linetype = Age)) +
    geom_hline(yintercept = 0, colour = gray(1/2), lty = 2) + #this adds the vertical line
    geom_linerange(aes(x = Variable, ymin = Difference - SE*interval, ymax = Difference + SE*interval),
                                            lwd = 1, position = position_dodge(width = 1/2)) +
    geom_pointrange(aes(x = Variable, y = Difference, ymin = Difference - SE*interval, ymax = Difference + SE*interval),
                                             lwd = 1/2, position = position_dodge(width = 1/2),
                                             shape = 21, fill = "WHITE") +
   coord_flip() + 
   theme_bw() +
   ggtitle("")

pdf("Figure4.pdf")
print(allvarsplot)
dev.off()




#=============================================================================================================================================
#============================================================Figure 5=========================================================================
#=============================================================================================================================================
#===========================================================================================================================================
##Generating the maps for Figure 5

# modified from:
# http://stackoverflow.com/questions/13316185/r-convert-zipcode-or-lat-long-to-county

latlong2county <- function(pointsDF) {
  # The single argument to this function, pointsDF, is a data.frame in which:
  #   - column 1 contains the longitude in degrees (negative in the US)
  #   - column 2 contains the latitude in degrees
  
  # prepare SpatialPolygons object with one SpatialPolygon per county
  counties <- map("county", fill = TRUE, col = "transparent", plot = FALSE)
  IDs <- sapply(strsplit(counties$names, ":"), function(x) x[1])
  counties_sp <- map2SpatialPolygons(counties, IDs = IDs,
                                     proj4string = CRS("+proj=longlat +datum=WGS84"))
  
  # convert pointsDF to a SpatialPoints object 
  pointsSP <- SpatialPoints(pointsDF, proj4string = CRS("+proj=longlat +datum=WGS84"))
  
  # use "over" to get _indices_ of the Polygons object containing each point 
  indices <- over(pointsSP, counties_sp)
  
  # return the state and county names of the Polygons object containing each point
  stateCountyNames <- sapply(counties_sp@polygons, function(x) x@ID)
  
  # extract just the county names
  countyNames <- sapply(strsplit(stateCountyNames, ","), "[", 2)
  countyNames[indices]
}


#getting the zipcodes
votezip <- data$votezip


# zipcodes database
data(zipcode)

# map data
region_df <- map_data("state")
subregion_df <- map_data("county")


# load US cities data
data(us.cities)

##cities we want to plot
cityNames1 <- c("Montgomery", "Phoenix", "Little Rock", "Sacramento", "Denver", "Hartford", "Dover", "Tallahassee", "Atlanta", "Boise", "Springfield IL", 
                "Indianapolis","Des Moines", "Topeka", "Frankfort", "Baton Rouge", "Augusta ME", "Annapolis", "Boston", "Lansing", "Saint Paul", "Jackson MS", "Jefferson City",
                "Helena", "Lincoln NE", "Carson City", "Concord NH", "Trenton", "Santa Fe", "Albany NY", "Raleigh", "Bismarck", "Columbus OH", "Oklahoma City",
                "Salem OR", "Harrisburg", "Providence", "Columbia SC", "Pierre", "Nashville", "Austin", "Salt Lake City", "Montpelier", "Richmond VA", "Olympia",
                "Charleston WV", "Madison WI","Cheyenne")

cityNames2 <- c("El Paso TX", "Detroit MI", "Fort Worth TX", "Charlotte NC", #"Columbus", Columbus should be in here but it is in our vector of state capitals
                "San Francisco CA",
                "Indianapolis IN", "Jacksonville FL", 
                "Austin TX", "San Jose CA", 
                "Dallas TX", "San Diego CA", 
                "San Antonio TX", "New York NY", 
                "Phoenix AZ", "Philadelphia PA", "Houston TX", "Chicago IL", "Los Angeles CA", "Memphis TN")


cityNames <- c(cityNames1, cityNames2)

bad_cityNames <- c("Augusta-Richmond", "East", "Village", "East", "North", "South", "West")
UScity <- us.cities[grepl(paste0(cityNames, collapse = "|"), us.cities$name), ]
UScity <- UScity[!grepl(paste0(bad_cityNames, collapse = "|"), UScity$name), ]

#-------------------------------------------------------------------
# clean data

# subset votezip data into the two different datasets while
# omitting missing zipcodes and converting to characters
# so that 5 digits with leading zeros are maintained
mTurk_zipcode <- sprintf("%05d", na.omit( votezip[1301:4006] ) )

# convert state names (e.g., New York)  to state abbreviations (e.g., NY)
region_df$state <- state.abb[match(region_df$region, tolower(state.name))]
subregion_df$state <- state.abb[match(subregion_df$region, tolower(state.name))]

#-------------------------------------------------------------------
# get lat/long data for mTurk zipcodes
mTurkplotZip <- with(zipcode, zipcode[zip %in% mTurk_zipcode, ])

# exclude zipcodes in Alaska (AK) and Hawaii (HI)
mTurkplotZip <- mTurkplotZip[!(mTurkplotZip$state %in% c("AK", "HI")), ]

# get subregion location from lat/long mTurkplotZip
mTurkplotZip$subregion <- latlong2county(data.frame(mTurkplotZip[, "longitude"],
                                                    mTurkplotZip[, "latitude"]))


#-------------------------------------------------------------------
#Manually Inputing names for counties that are NA

##94607 with alameda
mTurkplotZip[mTurkplotZip$zip=="94607",6] <- "alameda"

##98640 with pacific
mTurkplotZip[mTurkplotZip$zip=="98640",6] <- "pacific"

##98422 with pierce
mTurkplotZip[mTurkplotZip$zip=="98422",6] <- "pierce"

##98407 with pierce
mTurkplotZip[mTurkplotZip$zip=="98407",6] <- "pierce"

##98349 with pierce
mTurkplotZip[mTurkplotZip$zip=="98349",6] <- "pierce"

##98282 with snohomish
mTurkplotZip[mTurkplotZip$zip=="98282",6] <- "snohomish"

##98199 with king
mTurkplotZip[mTurkplotZip$zip=="98199",6] <- "king"

##98136 with king
mTurkplotZip[mTurkplotZip$zip=="98136",6] <- "king"

##98117 with king
mTurkplotZip[mTurkplotZip$zip=="98117",6] <- "king"

##98110 with kitsap
mTurkplotZip[mTurkplotZip$zip=="98110",6] <- "kitsap"

##98026 with snohomish
mTurkplotZip[mTurkplotZip$zip=="98026",6] <- "snohomish"

##97365 with lincoln
mTurkplotZip[mTurkplotZip$zip=="97365",6] <- "lincoln"

##94607 with alameda
mTurkplotZip[mTurkplotZip$zip=="94607",6] <- "alameda"

##94132 with san francisco
mTurkplotZip[mTurkplotZip$zip=="94132",6] <- "san francisco"

##94122 with san francisco
mTurkplotZip[mTurkplotZip$zip=="94122",6] <- "san francisco"

##94120 with san francisco
mTurkplotZip[mTurkplotZip$zip=="94120",6] <- "san francisco"

##94019 with san mateo
mTurkplotZip[mTurkplotZip$zip=="94019",6] <- "san mateo"

##92109 with san diego
mTurkplotZip[mTurkplotZip$zip=="92109",6] <- "san diego"

##92101 with san diego
mTurkplotZip[mTurkplotZip$zip=="92101",6] <- "san diego"

##92014 with san diego
mTurkplotZip[mTurkplotZip$zip=="92014",6] <- "san diego"

##90405 with los angeles
mTurkplotZip[mTurkplotZip$zip=="90405",6] <- "los angeles"

##90292 with los angeles 
mTurkplotZip[mTurkplotZip$zip=="90292",6] <- "los angeles"

##90291 with los angeles
mTurkplotZip[mTurkplotZip$zip=="90291",6] <- "los angeles"

##77550 with galveston
mTurkplotZip[mTurkplotZip$zip=="77550",6] <- "galveston"

##49453 with allegan
mTurkplotZip[mTurkplotZip$zip=="49453",6] <- "allegan"

##48226 with wayne
mTurkplotZip[mTurkplotZip$zip=="48226",6] <- "wayne"

##48029 with saint clair
mTurkplotZip[mTurkplotZip$zip=="48029",6] <- "saint clair"

##48060 with saint clair
mTurkplotZip[mTurkplotZip$zip=="48060",6] <- "saint clair"

##36604 with mobile
mTurkplotZip[mTurkplotZip$zip=="36604",6] <- "mobile"

##34228 with manatee
mTurkplotZip[mTurkplotZip$zip=="34228",6] <- "manatee"

##32548 with okaloosa
mTurkplotZip[mTurkplotZip$zip=="32548",6] <- "okaloosa"

##32541 with okaloosa
mTurkplotZip[mTurkplotZip$zip=="32541",6] <- "okaloosa"

##32176 with volusia
mTurkplotZip[mTurkplotZip$zip=="32176",6] <- "volusia"

##11569 with nassau
mTurkplotZip[mTurkplotZip$zip=="11569",6] <- "nassau"

##11228 with brooklyn
mTurkplotZip[mTurkplotZip$zip=="11228",6] <- "brooklyn"

##10002 with new york
mTurkplotZip[mTurkplotZip$zip=="10002",6] <- "new york"

##08401 with atlantic
mTurkplotZip[mTurkplotZip$zip=="08401",6] <- "atlantic"

##07047 with hudson
mTurkplotZip[mTurkplotZip$zip=="07047",6] <- "hudson"

##07047 with hudson
mTurkplotZip[mTurkplotZip$zip=="07047",6] <- "hudson"


# calculate proportion of mTurk zip codes in each state (out of entire USA)
mTurkplotZip_stateProp <- ddply(mTurkplotZip, .(state), summarize,
                                mTurk_stateProp = length(zip) / nrow(mTurkplotZip))

# calculate proportion of mTurk zip codes in each county (out of entire USA)
mTurkplotZip_countyProp <- ddply(mTurkplotZip, .(state, subregion), summarize,
                                 mTurk_countyProp = length(zip) / nrow(mTurkplotZip)) 





#-------------------------------------------------------------------
# merge attribute and polygon data

# merge zip proportion data to state data
plot_state_data <- merge(region_df, mTurkplotZip_stateProp, by = "state", all = TRUE)

# excluse NAs from state variable
plot_state_data <- plot_state_data[!is.na(plot_state_data$state), ] 
# sort the state plotting order 
plot_state_data <- plot_state_data[order(plot_state_data$order), ]


# merge zip proportion data to county data
plot_county_data <- merge(mTurkplotZip_countyProp, subregion_df, by = c("state", "subregion"), all = TRUE)
# excluse NAs from subregion variable
plot_county_data <- plot_county_data[!is.na(plot_county_data$subregion), ]  
# sort the county plotting order 
plot_county_data <- plot_county_data[order(plot_county_data$order), ]
# omit NAs for state and region
plot_county_data <- with(plot_county_data, plot_county_data[!is.na(state) & !is.na(region), ])


#-------------------------------------------------------------------
# ggplot themes

# no gridlines or axis text
theme_set(theme_bw())
theme_blank <- theme_update(
  axis.line = element_blank(), 
  axis.text.x = element_blank(), 
  axis.text.y = element_blank(),
  axis.ticks = element_blank(), 
  axis.title.x = element_blank(), 
  axis.title.y = element_blank(), 
  axis.ticks.length = unit(.1, "lines"), 
  axis.ticks.margin = unit(.1, "lines"), 
  panel.background = element_blank(), 
  panel.border = element_blank(), 
  panel.grid.major = element_blank(), 
  panel.grid.minor = element_blank(), 
  panel.margin = unit(.1, "lines"), 
  plot.background = element_blank(), 
  plot.margin = unit(c(-.5, -.4, -1, -.4), "lines"), # top, right, bottom, left
  strip.background = element_blank(),
  strip.text.x = element_text(vjust = -0.4),
  legend.position = "none", # scale position
  legend.title = element_blank()
)


# ================================================================================
# plot county level data

#-------------------------------------------------------------------
# utility functions for county-level data

# extract the legend from the first plot
g_legend <- function(a.gplot) {
  tmp <- ggplot_gtable(ggplot_build(a.gplot)) 
  legend <- tmp$grobs[[ which(sapply(tmp$grobs, function(x) x$name) == "guide-box") ]]
  return(legend)
}

# arrange plots in a grid
arrange.plots <- function(plotList, leg.scale, main.title, nc = 7) {
  legend <- g_legend(leg.scale) # extract legend information
  legend_height <- sum(legend$height)
  plotGrid <- arrangeGrob( # arrange plots and legend in grid
    do.call(arrangeGrob, c(plotList, ncol = nc) ),
    legend, 
    heights = unit.c(unit(0.97, "npc") - legend_height, legend_height),
    main = textGrob(main.title, gp = gpar(fontsize = 12, font = 2)))
  return(plotGrid)
}

#-------------------------------------------------------------------
# legend scale for entire USA

# base plot for legend scale
legendScale <- ggplot(plot_county_data,
                      aes(x = long, y = lat, group = subregion, fill = mTurk_countyProp)) +
  geom_polygon(color = alpha("white", 0.7), size = 0.03) +
  theme(legend.position = "bottom",
        legend.text = element_text(size = 9)) 

#-------------------------------------------------------------------
# calculate min and max proportion by state to get range for color scale
mTurkPropRange <- ddply(plot_county_data, .(state), summarize,
                        min = min(mTurk_countyProp, na.rm = TRUE),
                        max = max(mTurk_countyProp, na.rm = TRUE))
# calculate min and max proportions for whole USA to get range for color scale
mTurkRange <- data.frame(min = min(mTurkPropRange$min),
                         max = max(mTurkPropRange$max))

# add mTurk specific scale to base plot of legend
mTurkScale <- legendScale +
  scale_fill_gradient(guide = guide_colorbar(barwidth = 20, barheight = 0.8,
                                             label.position = "bottom"),
                      limits = c(mTurkRange$min*0.99, mTurkRange$max*1.01),
                      labels = percent_format())


#-------------------------------------------------------------------
# create a vector of the states we want to keep in the order we want to plotted them
keep_states <- na.omit(unique(plot_county_data$region)) # all states

#-------------------------------------------------------------------
# initilaize mTurk list
mTurk_maps <- vector(mode = "list", length = length(keep_states))

# loop through mTurk plots
for (i in seq_along(keep_states)) {
  
  mTurk_maps[[i]] <- ggplot(plot_county_data[plot_county_data$region == keep_states[i], ],
                            aes(x = long, y = lat, group = subregion, fill = mTurk_countyProp)) +
    geom_polygon(color = alpha("white", 0.7), size = 0.03) +
    geom_point(data = UScity[UScity$country.etc == state.abb[match(keep_states[i], tolower(state.name))], ],
               aes(long, lat, fill = NULL, group = NULL), size = 2, color = "red") +
    coord_map(project = "albers", at0 = 45.5, lat1 = 29.5) +
    facet_wrap(~ region) +
    scale_fill_gradient(guide = guide_colorbar(barwidth = 0.5, barheight = 7),
                        limits = c(mTurkRange$min*0.99, mTurkRange$max*1.01),
                        labels = percent_format()) +
    labs(x = "Longitude", y = "Latitude") 
  
}

#-------------------------------------------------------------------
# arrange plots and legend
mTurk_all_states <- arrange.plots(plotList = mTurk_maps, leg.scale = mTurkScale,
                                  main.title = "")

ggsave(mTurk_all_states, file = "Figure5.pdf", width = 16, height = 11)





#=============================================================================================================================================
#============================================================Figure 6=========================================================================
#=============================================================================================================================================
#===========================================================================================================================================
##Loading the master zip code file from the larger pool of mturkers
load("Mturk_Master_Zips.Rdata")

mturk.master.zipcodes <- as.numeric(mturk.master.zipcodes)



# subset cities data
UScity <- us.cities[grepl(paste0(cityNames, collapse = "|"), us.cities$name), ]
UScity <- UScity[!grepl(paste0(bad_cityNames, collapse = "|"), UScity$name), ]

#-------------------------------------------------------------------
# clean data

# subset votezip data into the two different datasets while
# omitting missing zipcodes and converting to characters
# so that 5 digits with leading zeros are maintained
mTurk_zipcode <- sprintf("%05d", na.omit( mturk.master.zipcodes ) )

# convert state names (e.g., New York)  to state abbreviations (e.g., NY)
region_df$state <- state.abb[match(region_df$region, tolower(state.name))]
subregion_df$state <- state.abb[match(subregion_df$region, tolower(state.name))]

#-------------------------------------------------------------------
# mTurk

# get lat/long data for mTurk zipcodes
mTurkplotZip <- with(zipcode, zipcode[zip %in% mTurk_zipcode, ])

# exclude zipcodes in Alaska (AK) and Hawaii (HI)
mTurkplotZip <- mTurkplotZip[!(mTurkplotZip$state %in% c("AK", "HI", "PR", "DC")), ]

mTurkplotZip <- na.omit(mTurkplotZip)
#omitting locitations with longitude/latitude missing


# get subregion location from lat/long mTurkplotZip
mTurkplotZip$subregion <- latlong2county(data.frame(mTurkplotZip[, "longitude"],
                                                    mTurkplotZip[, "latitude"]))


# calculate proportion of mTurk zip codes in each state (out of entire USA)
mTurkplotZip_stateProp <- ddply(mTurkplotZip, .(state), summarize,
                                mTurk_stateProp = length(zip) / nrow(mTurkplotZip))

# calculate proportion of mTurk zip codes in each county (out of entire USA)
mTurkplotZip_countyProp <- ddply(mTurkplotZip, .(state, subregion), summarize,
                                 mTurk_countyProp = length(zip) / nrow(mTurkplotZip)) 


#-------------------------------------------------------------------
# merge attribute and polygon data

# merge zip proportion data to state data
plot_state_data <- merge(region_df, mTurkplotZip_stateProp, by = "state", all = TRUE)
# excluse NAs from state variable
plot_state_data <- plot_state_data[!is.na(plot_state_data$state), ] 
# sort the state plotting order 
plot_state_data <- plot_state_data[order(plot_state_data$order), ]


# merge zip proportion data to county data
plot_county_data <- merge(mTurkplotZip_countyProp, subregion_df, by = c("state", "subregion"), all = TRUE)
# excluse NAs from subregion variable
plot_county_data <- plot_county_data[!is.na(plot_county_data$subregion), ]  
# sort the county plotting order 
plot_county_data <- plot_county_data[order(plot_county_data$order), ]
# omit NAs for state and region
plot_county_data <- with(plot_county_data, plot_county_data[!is.na(state) & !is.na(region), ])


#-------------------------------------------------------------------
# ggplot themes

# no gridlines or axis text
theme_set(theme_bw())
theme_blank <- theme_update(
  axis.line = element_blank(), 
  axis.text.x = element_blank(), 
  axis.text.y = element_blank(),
  axis.ticks = element_blank(), 
  axis.title.x = element_blank(), 
  axis.title.y = element_blank(), 
  axis.ticks.length = unit(.1, "lines"), 
  axis.ticks.margin = unit(.1, "lines"), 
  panel.background = element_blank(), 
  panel.border = element_blank(), 
  panel.grid.major = element_blank(), 
  panel.grid.minor = element_blank(), 
  panel.margin = unit(.1, "lines"), 
  plot.background = element_blank(), 
  plot.margin = unit(c(-.5, -.4, -1, -.4), "lines"), # top, right, bottom, left
  strip.background = element_blank(),
  strip.text.x = element_text(vjust = -0.4),
  legend.position = "none", # scale position
  legend.title = element_blank()
)




#-------------------------------------------------------------------
# utility functions for county-level data

# extract the legend from the first plot
g_legend <- function(a.gplot) {
  tmp <- ggplot_gtable(ggplot_build(a.gplot)) 
  legend <- tmp$grobs[[ which(sapply(tmp$grobs, function(x) x$name) == "guide-box") ]]
  return(legend)
}

# arrange plots in a grid
arrange.plots <- function(plotList, leg.scale, main.title, nc = 7) {
  legend <- g_legend(leg.scale) # extract legend information
  legend_height <- sum(legend$height)
  plotGrid <- arrangeGrob( # arrange plots and legend in grid
    do.call(arrangeGrob, c(plotList, ncol = nc) ),
    legend, 
    heights = unit.c(unit(0.97, "npc") - legend_height, legend_height),
    main = textGrob(main.title, gp = gpar(fontsize = 12, font = 2)))
  return(plotGrid)
}

#-------------------------------------------------------------------
# legend scale for entire USA

# base plot for legend scale
legendScale <- ggplot(plot_county_data,
                      aes(x = long, y = lat, group = subregion, fill = mTurk_countyProp)) +
  geom_polygon(color = alpha("white", 0.7), size = 0.03) +
  theme(legend.position = "bottom",
        legend.text = element_text(size = 9)) 

#-------------------------------------------------------------------
# calculate min and max proportion by state to get range for color scale
mTurkPropRange <- ddply(plot_county_data, .(state), summarize,
                        min = min(mTurk_countyProp, na.rm = TRUE),
                        max = max(mTurk_countyProp, na.rm = TRUE))
# calculate min and max proportions for whole USA to get range for color scale
mTurkRange <- data.frame(min = min(mTurkPropRange$min),
                         max = max(mTurkPropRange$max))

# add mTurk specific scale to base plot of legend
mTurkScale <- legendScale +
  scale_fill_gradient(guide = guide_colorbar(barwidth = 20, barheight = 0.8,
                                             label.position = "bottom"),
                      limits = c(mTurkRange$min*0.99, mTurkRange$max*1.01),
                      labels = percent_format())

#-------------------------------------------------------------------
# create a vector of the states we want to keep in the order we want to plotted them
keep_states <- na.omit(unique(plot_county_data$region)) # all states

#-------------------------------------------------------------------
# initilaize mTurk list
mTurk_maps <- vector(mode = "list", length = length(keep_states))

# loop through mTurk plots
for (i in seq_along(keep_states)) {
  
  mTurk_maps[[i]] <- ggplot(plot_county_data[plot_county_data$region == keep_states[i], ],
                            aes(x = long, y = lat, group = subregion, fill = mTurk_countyProp)) +
    geom_polygon(color = alpha("white", 0.7), size = 0.03) +
    geom_point(data = UScity[UScity$country.etc == state.abb[match(keep_states[i], tolower(state.name))], ],
               aes(long, lat, fill = NULL, group = NULL), size = 2, color = "red") +
    coord_map(project = "albers", at0 = 45.5, lat1 = 29.5) +
    facet_wrap(~ region) +
    scale_fill_gradient(guide = guide_colorbar(barwidth = 0.5, barheight = 7),
                        limits = c(mTurkRange$min*0.99, mTurkRange$max*1.01),
                        labels = percent_format()) +
    labs(x = "Longitude", y = "Latitude") 
  
}



mTurk_all_states_large <- arrange.plots(plotList = mTurk_maps, leg.scale = mTurkScale,
                                        main.title = "mTurk")

# save composite plot
ggsave(mTurk_all_states_large, file = "Figure6.pdf", width = 16, height = 11)



#=============================================================================================================================================
#============================================================Table 1=========================================================================
#=============================================================================================================================================
#===========================================================================================================================================
#Generating the occupations of workers by category for Table 1
cces.occ <- table(cces.data$occupation)/nrow(subset(cces.data, cces.data$occupation!="NA"))
mturk.occ <- table(mturk.data$occupation)/nrow(subset(mturk.data, mturk.data$occupation!="NA"))

occ.df <- as.data.frame(cbind(cces.occ*100, mturk.occ*100))
names(occ.df) <- c("CCES%", "MTurk%")
rownames(occ.df) <- c("Management", "Independent Contractor", "Business Owner", "Owner-Operator", "Administrative Support", "Healthcare Support",
                      "Protective Service", "Food Prep", "Personal Care", "Maintenance and Repair", "Ground Cleaning and Maintenance", "Other Service",
                      "Trade Worker", "Professional")
occ.df





#=============================================================================================================================================
#============================================================Table 2=========================================================================
#=============================================================================================================================================
#===========================================================================================================================================
#Using FIPS data and a master dataset of zip codes as specified in the paper to generate the number of individuals in urban/rural areas

#the geo data is all the unique FIPS codes in the us
ru.data <- read.csv("RuralUrbanCodes.csv")

#This dataset corresponds to the number of zip codes
geo.data <- read.csv("geocorr12.csv", header=TRUE)


CCES_zipcode <- sprintf("%05d", na.omit( votezip[1:1300] ) )
mTurk_zipcode <- sprintf("%05d", na.omit( votezip[1301:4006] ) )


#Agregating the CCES data to zip code level 
cces.zip.df <- as.data.frame(table(CCES_zipcode))
mturk.zip.df <- as.data.frame(table(mTurk_zipcode))


#getting rid of the first row in the geo.data
geo.data <- geo.data[-1,]

#Converting the weight from factor to numeric
geo.data$afact <- as.numeric(as.character(geo.data$afact))


#Getting rid of commas in population numbers for Rural/Urban data
ru.data$Population_2010 <- as.numeric(gsub(",","", ru.data$Population_2010))


#adding a leading zero to the start of the Rural/Urban FIPS codes so that they can be properly merged with the other datasets
ru.data$FIPS <- sprintf("%05d", ru.data$FIPS)


#Merging the Geo Data with Harvard/CCESnceton CCES MTurk
geo.cces.data <- merge(geo.data, cces.zip.df, by.x="zcta5", by.y="CCES_zipcode", all=TRUE)
geo.mturk.data <- merge(geo.data, mturk.zip.df, by.x="zcta5", by.y="mTurk_zipcode", all=TRUE)


#Making a new variable which multiplies the zip code by the weight from geo data
geo.cces.data$weighted.freq <- geo.cces.data$afact*geo.cces.data$Freq

geo.mturk.data$weighted.freq <- geo.mturk.data$afact*geo.mturk.data$Freq



#summing to the FIPS level
cces.fips <- ddply(geo.cces.data, .(county), summarize, fips.freq = sum(weighted.freq, na.rm=T))
mturk.fips <- ddply(geo.mturk.data, .(county), summarize, fips.freq = sum(weighted.freq, na.rm=T))


#Merging the summed FIPS data with the rural urban data
cces.final.df <- merge(cces.fips, ru.data, by.x="county", by.y="FIPS", all=TRUE)
mturk.final.df <- merge(mturk.fips, ru.data, by.x="county", by.y="FIPS", all=TRUE)



#Aggregating to the rural/Urban using FIPS
#creating a new variable which will be the denominator
cces.total <- sum(cces.final.df$fips.freq, na.rm=T)
mturk.total <- sum(mturk.final.df$fips.freq, na.rm=T)

cces.ru.df <- ddply(cces.final.df, .(RUCC_2013), summarize, rural.freq = (sum(fips.freq, na.rm=T)/cces.total)*100)
mturk.ru.df <- ddply(mturk.final.df, .(RUCC_2013), summarize, rural.freq = (sum(fips.freq, na.rm=T)/mturk.total)*100)

merged.df <- merge(cces.ru.df, mturk.ru.df, by="RUCC_2013")

names(merged.df) <- c("Urban/Rural Code", "CCES", "MTurk")
xtable(merged.df)



